.text

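# ossl_rsaz_amm52x40_x1_avxifma256: almost Montgomery multiplication on
# numbers represented as 40 digits of 52 bits each (radix 2^52), using
# VEX-encoded ({vex}) VPMADD52 AVX-IFMA instructions.
# Presumed System V arguments, per the usual ossl_rsaz AMM interface:
#   rdi = res, rsi = a, rdx = b, rcx = m (modulus), r8 = k0.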
.globl  ossl_rsaz_amm52x40_x1_avxifma256
.type   ossl_rsaz_amm52x40_x1_avxifma256,@function
.align  32
ossl_rsaz_amm52x40_x1_avxifma256:
.cfi_startproc
.byte   243,15,30,250
        pushq   %rbx
.cfi_adjust_cfa_offset  8
.cfi_offset     %rbx,-16
        pushq   %rbp
.cfi_adjust_cfa_offset  8
.cfi_offset     %rbp,-24
        pushq   %r12
.cfi_adjust_cfa_offset  8
.cfi_offset     %r12,-32
        pushq   %r13
.cfi_adjust_cfa_offset  8
.cfi_offset     %r13,-40
        pushq   %r14
.cfi_adjust_cfa_offset  8
.cfi_offset     %r14,-48
        pushq   %r15
.cfi_adjust_cfa_offset  8
.cfi_offset     %r15,-56

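        # ymm3..ymm12 hold the 40-digit accumulator, four digits per register.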
        vpxor   %ymm0,%ymm0,%ymm0
        vmovapd %ymm0,%ymm3
        vmovapd %ymm0,%ymm4
        vmovapd %ymm0,%ymm5
        vmovapd %ymm0,%ymm6
        vmovapd %ymm0,%ymm7
        vmovapd %ymm0,%ymm8
        vmovapd %ymm0,%ymm9
        vmovapd %ymm0,%ymm10
        vmovapd %ymm0,%ymm11
        vmovapd %ymm0,%ymm12

        xorl    %r9d,%r9d

        movq    %rdx,%r11
        movq    $0xfffffffffffff,%rax


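        # r9 accumulates the stray low digit, rax is the 52-bit digit mask.
        # The outer loop runs 10 times with its body unrolled 4x, consuming
        # all 40 words of b.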
        movl    $10,%ebx

.align  32
.Lloop10:
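        # Scalar head of one step: t0 = r9 + a[0]*b[i]; derive the reduction
        # multiplier y = k0*t0 mod 2^52, add m[0]*y, then keep the upper bits
        # (>> 52) in r9 as the incoming low digit of the shifted accumulator.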
        movq    0(%r11),%r13

        vpbroadcastq    0(%r11),%ymm1
        movq    0(%rsi),%rdx
        mulxq   %r13,%r13,%r12
        addq    %r13,%r9
        movq    %r12,%r10
        adcq    $0,%r10

        movq    %r8,%r13
        imulq   %r9,%r13
        andq    %rax,%r13

        vmovq   %r13,%xmm2
        vpbroadcastq    %xmm2,%ymm2
        movq    0(%rcx),%rdx
        mulxq   %r13,%r13,%r12
        addq    %r13,%r9
        adcq    %r12,%r10

        shrq    $52,%r9
        salq    $12,%r10
        orq     %r10,%r9

        leaq    -328(%rsp),%rsp

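        # Accumulate the low 52-bit product halves: acc += lo52(a * b[i]) +
        # lo52(m * y).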
{vex}   vpmadd52luq     0(%rsi),%ymm1,%ymm3
{vex}   vpmadd52luq     32(%rsi),%ymm1,%ymm4
{vex}   vpmadd52luq     64(%rsi),%ymm1,%ymm5
{vex}   vpmadd52luq     96(%rsi),%ymm1,%ymm6
{vex}   vpmadd52luq     128(%rsi),%ymm1,%ymm7
{vex}   vpmadd52luq     160(%rsi),%ymm1,%ymm8
{vex}   vpmadd52luq     192(%rsi),%ymm1,%ymm9
{vex}   vpmadd52luq     224(%rsi),%ymm1,%ymm10
{vex}   vpmadd52luq     256(%rsi),%ymm1,%ymm11
{vex}   vpmadd52luq     288(%rsi),%ymm1,%ymm12

{vex}   vpmadd52luq     0(%rcx),%ymm2,%ymm3
{vex}   vpmadd52luq     32(%rcx),%ymm2,%ymm4
{vex}   vpmadd52luq     64(%rcx),%ymm2,%ymm5
{vex}   vpmadd52luq     96(%rcx),%ymm2,%ymm6
{vex}   vpmadd52luq     128(%rcx),%ymm2,%ymm7
{vex}   vpmadd52luq     160(%rcx),%ymm2,%ymm8
{vex}   vpmadd52luq     192(%rcx),%ymm2,%ymm9
{vex}   vpmadd52luq     224(%rcx),%ymm2,%ymm10
{vex}   vpmadd52luq     256(%rcx),%ymm2,%ymm11
{vex}   vpmadd52luq     288(%rcx),%ymm2,%ymm12
        vmovdqu %ymm3,0(%rsp)
        vmovdqu %ymm4,32(%rsp)
        vmovdqu %ymm5,64(%rsp)
        vmovdqu %ymm6,96(%rsp)
        vmovdqu %ymm7,128(%rsp)
        vmovdqu %ymm8,160(%rsp)
        vmovdqu %ymm9,192(%rsp)
        vmovdqu %ymm10,224(%rsp)
        vmovdqu %ymm11,256(%rsp)
        vmovdqu %ymm12,288(%rsp)
        movq    $0,320(%rsp)

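        # The accumulator must now shift right by one digit.  Digits live in
        # 64-bit lanes, so spill to the stack, append a zero qword, and
        # reload from offset +8: a cross-register shift by one 64-bit lane.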
        vmovdqu 8(%rsp),%ymm3
        vmovdqu 40(%rsp),%ymm4
        vmovdqu 72(%rsp),%ymm5
        vmovdqu 104(%rsp),%ymm6
        vmovdqu 136(%rsp),%ymm7
        vmovdqu 168(%rsp),%ymm8
        vmovdqu 200(%rsp),%ymm9
        vmovdqu 232(%rsp),%ymm10
        vmovdqu 264(%rsp),%ymm11
        vmovdqu 296(%rsp),%ymm12

        addq    8(%rsp),%r9

{vex}   vpmadd52huq     0(%rsi),%ymm1,%ymm3
{vex}   vpmadd52huq     32(%rsi),%ymm1,%ymm4
{vex}   vpmadd52huq     64(%rsi),%ymm1,%ymm5
{vex}   vpmadd52huq     96(%rsi),%ymm1,%ymm6
{vex}   vpmadd52huq     128(%rsi),%ymm1,%ymm7
{vex}   vpmadd52huq     160(%rsi),%ymm1,%ymm8
{vex}   vpmadd52huq     192(%rsi),%ymm1,%ymm9
{vex}   vpmadd52huq     224(%rsi),%ymm1,%ymm10
{vex}   vpmadd52huq     256(%rsi),%ymm1,%ymm11
{vex}   vpmadd52huq     288(%rsi),%ymm1,%ymm12

{vex}   vpmadd52huq     0(%rcx),%ymm2,%ymm3
{vex}   vpmadd52huq     32(%rcx),%ymm2,%ymm4
{vex}   vpmadd52huq     64(%rcx),%ymm2,%ymm5
{vex}   vpmadd52huq     96(%rcx),%ymm2,%ymm6
{vex}   vpmadd52huq     128(%rcx),%ymm2,%ymm7
{vex}   vpmadd52huq     160(%rcx),%ymm2,%ymm8
{vex}   vpmadd52huq     192(%rcx),%ymm2,%ymm9
{vex}   vpmadd52huq     224(%rcx),%ymm2,%ymm10
{vex}   vpmadd52huq     256(%rcx),%ymm2,%ymm11
{vex}   vpmadd52huq     288(%rcx),%ymm2,%ymm12
        leaq    328(%rsp),%rsp
        movq    8(%r11),%r13

        vpbroadcastq    8(%r11),%ymm1
        movq    0(%rsi),%rdx
        mulxq   %r13,%r13,%r12
        addq    %r13,%r9
        movq    %r12,%r10
        adcq    $0,%r10

        movq    %r8,%r13
        imulq   %r9,%r13
        andq    %rax,%r13

        vmovq   %r13,%xmm2
        vpbroadcastq    %xmm2,%ymm2
        movq    0(%rcx),%rdx
        mulxq   %r13,%r13,%r12
        addq    %r13,%r9
        adcq    %r12,%r10

        shrq    $52,%r9
        salq    $12,%r10
        orq     %r10,%r9

        leaq    -328(%rsp),%rsp

{vex}   vpmadd52luq     0(%rsi),%ymm1,%ymm3
{vex}   vpmadd52luq     32(%rsi),%ymm1,%ymm4
{vex}   vpmadd52luq     64(%rsi),%ymm1,%ymm5
{vex}   vpmadd52luq     96(%rsi),%ymm1,%ymm6
{vex}   vpmadd52luq     128(%rsi),%ymm1,%ymm7
{vex}   vpmadd52luq     160(%rsi),%ymm1,%ymm8
{vex}   vpmadd52luq     192(%rsi),%ymm1,%ymm9
{vex}   vpmadd52luq     224(%rsi),%ymm1,%ymm10
{vex}   vpmadd52luq     256(%rsi),%ymm1,%ymm11
{vex}   vpmadd52luq     288(%rsi),%ymm1,%ymm12

{vex}   vpmadd52luq     0(%rcx),%ymm2,%ymm3
{vex}   vpmadd52luq     32(%rcx),%ymm2,%ymm4
{vex}   vpmadd52luq     64(%rcx),%ymm2,%ymm5
{vex}   vpmadd52luq     96(%rcx),%ymm2,%ymm6
{vex}   vpmadd52luq     128(%rcx),%ymm2,%ymm7
{vex}   vpmadd52luq     160(%rcx),%ymm2,%ymm8
{vex}   vpmadd52luq     192(%rcx),%ymm2,%ymm9
{vex}   vpmadd52luq     224(%rcx),%ymm2,%ymm10
{vex}   vpmadd52luq     256(%rcx),%ymm2,%ymm11
{vex}   vpmadd52luq     288(%rcx),%ymm2,%ymm12
        vmovdqu %ymm3,0(%rsp)
        vmovdqu %ymm4,32(%rsp)
        vmovdqu %ymm5,64(%rsp)
        vmovdqu %ymm6,96(%rsp)
        vmovdqu %ymm7,128(%rsp)
        vmovdqu %ymm8,160(%rsp)
        vmovdqu %ymm9,192(%rsp)
        vmovdqu %ymm10,224(%rsp)
        vmovdqu %ymm11,256(%rsp)
        vmovdqu %ymm12,288(%rsp)
        movq    $0,320(%rsp)

        vmovdqu 8(%rsp),%ymm3
        vmovdqu 40(%rsp),%ymm4
        vmovdqu 72(%rsp),%ymm5
        vmovdqu 104(%rsp),%ymm6
        vmovdqu 136(%rsp),%ymm7
        vmovdqu 168(%rsp),%ymm8
        vmovdqu 200(%rsp),%ymm9
        vmovdqu 232(%rsp),%ymm10
        vmovdqu 264(%rsp),%ymm11
        vmovdqu 296(%rsp),%ymm12

        addq    8(%rsp),%r9

{vex}   vpmadd52huq     0(%rsi),%ymm1,%ymm3
{vex}   vpmadd52huq     32(%rsi),%ymm1,%ymm4
{vex}   vpmadd52huq     64(%rsi),%ymm1,%ymm5
{vex}   vpmadd52huq     96(%rsi),%ymm1,%ymm6
{vex}   vpmadd52huq     128(%rsi),%ymm1,%ymm7
{vex}   vpmadd52huq     160(%rsi),%ymm1,%ymm8
{vex}   vpmadd52huq     192(%rsi),%ymm1,%ymm9
{vex}   vpmadd52huq     224(%rsi),%ymm1,%ymm10
{vex}   vpmadd52huq     256(%rsi),%ymm1,%ymm11
{vex}   vpmadd52huq     288(%rsi),%ymm1,%ymm12

{vex}   vpmadd52huq     0(%rcx),%ymm2,%ymm3
{vex}   vpmadd52huq     32(%rcx),%ymm2,%ymm4
{vex}   vpmadd52huq     64(%rcx),%ymm2,%ymm5
{vex}   vpmadd52huq     96(%rcx),%ymm2,%ymm6
{vex}   vpmadd52huq     128(%rcx),%ymm2,%ymm7
{vex}   vpmadd52huq     160(%rcx),%ymm2,%ymm8
{vex}   vpmadd52huq     192(%rcx),%ymm2,%ymm9
{vex}   vpmadd52huq     224(%rcx),%ymm2,%ymm10
{vex}   vpmadd52huq     256(%rcx),%ymm2,%ymm11
{vex}   vpmadd52huq     288(%rcx),%ymm2,%ymm12
        leaq    328(%rsp),%rsp
        movq    16(%r11),%r13

        vpbroadcastq    16(%r11),%ymm1
        movq    0(%rsi),%rdx
        mulxq   %r13,%r13,%r12
        addq    %r13,%r9
        movq    %r12,%r10
        adcq    $0,%r10

        movq    %r8,%r13
        imulq   %r9,%r13
        andq    %rax,%r13

        vmovq   %r13,%xmm2
        vpbroadcastq    %xmm2,%ymm2
        movq    0(%rcx),%rdx
        mulxq   %r13,%r13,%r12
        addq    %r13,%r9
        adcq    %r12,%r10

        shrq    $52,%r9
        salq    $12,%r10
        orq     %r10,%r9

        leaq    -328(%rsp),%rsp

{vex}   vpmadd52luq     0(%rsi),%ymm1,%ymm3
{vex}   vpmadd52luq     32(%rsi),%ymm1,%ymm4
{vex}   vpmadd52luq     64(%rsi),%ymm1,%ymm5
{vex}   vpmadd52luq     96(%rsi),%ymm1,%ymm6
{vex}   vpmadd52luq     128(%rsi),%ymm1,%ymm7
{vex}   vpmadd52luq     160(%rsi),%ymm1,%ymm8
{vex}   vpmadd52luq     192(%rsi),%ymm1,%ymm9
{vex}   vpmadd52luq     224(%rsi),%ymm1,%ymm10
{vex}   vpmadd52luq     256(%rsi),%ymm1,%ymm11
{vex}   vpmadd52luq     288(%rsi),%ymm1,%ymm12

{vex}   vpmadd52luq     0(%rcx),%ymm2,%ymm3
{vex}   vpmadd52luq     32(%rcx),%ymm2,%ymm4
{vex}   vpmadd52luq     64(%rcx),%ymm2,%ymm5
{vex}   vpmadd52luq     96(%rcx),%ymm2,%ymm6
{vex}   vpmadd52luq     128(%rcx),%ymm2,%ymm7
{vex}   vpmadd52luq     160(%rcx),%ymm2,%ymm8
{vex}   vpmadd52luq     192(%rcx),%ymm2,%ymm9
{vex}   vpmadd52luq     224(%rcx),%ymm2,%ymm10
{vex}   vpmadd52luq     256(%rcx),%ymm2,%ymm11
{vex}   vpmadd52luq     288(%rcx),%ymm2,%ymm12
        vmovdqu %ymm3,0(%rsp)
        vmovdqu %ymm4,32(%rsp)
        vmovdqu %ymm5,64(%rsp)
        vmovdqu %ymm6,96(%rsp)
        vmovdqu %ymm7,128(%rsp)
        vmovdqu %ymm8,160(%rsp)
        vmovdqu %ymm9,192(%rsp)
        vmovdqu %ymm10,224(%rsp)
        vmovdqu %ymm11,256(%rsp)
        vmovdqu %ymm12,288(%rsp)
        movq    $0,320(%rsp)

        vmovdqu 8(%rsp),%ymm3
        vmovdqu 40(%rsp),%ymm4
        vmovdqu 72(%rsp),%ymm5
        vmovdqu 104(%rsp),%ymm6
        vmovdqu 136(%rsp),%ymm7
        vmovdqu 168(%rsp),%ymm8
        vmovdqu 200(%rsp),%ymm9
        vmovdqu 232(%rsp),%ymm10
        vmovdqu 264(%rsp),%ymm11
        vmovdqu 296(%rsp),%ymm12

        addq    8(%rsp),%r9

{vex}   vpmadd52huq     0(%rsi),%ymm1,%ymm3
{vex}   vpmadd52huq     32(%rsi),%ymm1,%ymm4
{vex}   vpmadd52huq     64(%rsi),%ymm1,%ymm5
{vex}   vpmadd52huq     96(%rsi),%ymm1,%ymm6
{vex}   vpmadd52huq     128(%rsi),%ymm1,%ymm7
{vex}   vpmadd52huq     160(%rsi),%ymm1,%ymm8
{vex}   vpmadd52huq     192(%rsi),%ymm1,%ymm9
{vex}   vpmadd52huq     224(%rsi),%ymm1,%ymm10
{vex}   vpmadd52huq     256(%rsi),%ymm1,%ymm11
{vex}   vpmadd52huq     288(%rsi),%ymm1,%ymm12

{vex}   vpmadd52huq     0(%rcx),%ymm2,%ymm3
{vex}   vpmadd52huq     32(%rcx),%ymm2,%ymm4
{vex}   vpmadd52huq     64(%rcx),%ymm2,%ymm5
{vex}   vpmadd52huq     96(%rcx),%ymm2,%ymm6
{vex}   vpmadd52huq     128(%rcx),%ymm2,%ymm7
{vex}   vpmadd52huq     160(%rcx),%ymm2,%ymm8
{vex}   vpmadd52huq     192(%rcx),%ymm2,%ymm9
{vex}   vpmadd52huq     224(%rcx),%ymm2,%ymm10
{vex}   vpmadd52huq     256(%rcx),%ymm2,%ymm11
{vex}   vpmadd52huq     288(%rcx),%ymm2,%ymm12
        leaq    328(%rsp),%rsp
        movq    24(%r11),%r13

        vpbroadcastq    24(%r11),%ymm1
        movq    0(%rsi),%rdx
        mulxq   %r13,%r13,%r12
        addq    %r13,%r9
        movq    %r12,%r10
        adcq    $0,%r10

        movq    %r8,%r13
        imulq   %r9,%r13
        andq    %rax,%r13

        vmovq   %r13,%xmm2
        vpbroadcastq    %xmm2,%ymm2
        movq    0(%rcx),%rdx
        mulxq   %r13,%r13,%r12
        addq    %r13,%r9
        adcq    %r12,%r10

        shrq    $52,%r9
        salq    $12,%r10
        orq     %r10,%r9

        leaq    -328(%rsp),%rsp

{vex}   vpmadd52luq     0(%rsi),%ymm1,%ymm3
{vex}   vpmadd52luq     32(%rsi),%ymm1,%ymm4
{vex}   vpmadd52luq     64(%rsi),%ymm1,%ymm5
{vex}   vpmadd52luq     96(%rsi),%ymm1,%ymm6
{vex}   vpmadd52luq     128(%rsi),%ymm1,%ymm7
{vex}   vpmadd52luq     160(%rsi),%ymm1,%ymm8
{vex}   vpmadd52luq     192(%rsi),%ymm1,%ymm9
{vex}   vpmadd52luq     224(%rsi),%ymm1,%ymm10
{vex}   vpmadd52luq     256(%rsi),%ymm1,%ymm11
{vex}   vpmadd52luq     288(%rsi),%ymm1,%ymm12

{vex}   vpmadd52luq     0(%rcx),%ymm2,%ymm3
{vex}   vpmadd52luq     32(%rcx),%ymm2,%ymm4
{vex}   vpmadd52luq     64(%rcx),%ymm2,%ymm5
{vex}   vpmadd52luq     96(%rcx),%ymm2,%ymm6
{vex}   vpmadd52luq     128(%rcx),%ymm2,%ymm7
{vex}   vpmadd52luq     160(%rcx),%ymm2,%ymm8
{vex}   vpmadd52luq     192(%rcx),%ymm2,%ymm9
{vex}   vpmadd52luq     224(%rcx),%ymm2,%ymm10
{vex}   vpmadd52luq     256(%rcx),%ymm2,%ymm11
{vex}   vpmadd52luq     288(%rcx),%ymm2,%ymm12
        vmovdqu %ymm3,0(%rsp)
        vmovdqu %ymm4,32(%rsp)
        vmovdqu %ymm5,64(%rsp)
        vmovdqu %ymm6,96(%rsp)
        vmovdqu %ymm7,128(%rsp)
        vmovdqu %ymm8,160(%rsp)
        vmovdqu %ymm9,192(%rsp)
        vmovdqu %ymm10,224(%rsp)
        vmovdqu %ymm11,256(%rsp)
        vmovdqu %ymm12,288(%rsp)
        movq    $0,320(%rsp)

        vmovdqu 8(%rsp),%ymm3
        vmovdqu 40(%rsp),%ymm4
        vmovdqu 72(%rsp),%ymm5
        vmovdqu 104(%rsp),%ymm6
        vmovdqu 136(%rsp),%ymm7
        vmovdqu 168(%rsp),%ymm8
        vmovdqu 200(%rsp),%ymm9
        vmovdqu 232(%rsp),%ymm10
        vmovdqu 264(%rsp),%ymm11
        vmovdqu 296(%rsp),%ymm12

        addq    8(%rsp),%r9

{vex}   vpmadd52huq     0(%rsi),%ymm1,%ymm3
{vex}   vpmadd52huq     32(%rsi),%ymm1,%ymm4
{vex}   vpmadd52huq     64(%rsi),%ymm1,%ymm5
{vex}   vpmadd52huq     96(%rsi),%ymm1,%ymm6
{vex}   vpmadd52huq     128(%rsi),%ymm1,%ymm7
{vex}   vpmadd52huq     160(%rsi),%ymm1,%ymm8
{vex}   vpmadd52huq     192(%rsi),%ymm1,%ymm9
{vex}   vpmadd52huq     224(%rsi),%ymm1,%ymm10
{vex}   vpmadd52huq     256(%rsi),%ymm1,%ymm11
{vex}   vpmadd52huq     288(%rsi),%ymm1,%ymm12

{vex}   vpmadd52huq     0(%rcx),%ymm2,%ymm3
{vex}   vpmadd52huq     32(%rcx),%ymm2,%ymm4
{vex}   vpmadd52huq     64(%rcx),%ymm2,%ymm5
{vex}   vpmadd52huq     96(%rcx),%ymm2,%ymm6
{vex}   vpmadd52huq     128(%rcx),%ymm2,%ymm7
{vex}   vpmadd52huq     160(%rcx),%ymm2,%ymm8
{vex}   vpmadd52huq     192(%rcx),%ymm2,%ymm9
{vex}   vpmadd52huq     224(%rcx),%ymm2,%ymm10
{vex}   vpmadd52huq     256(%rcx),%ymm2,%ymm11
{vex}   vpmadd52huq     288(%rcx),%ymm2,%ymm12
        leaq    328(%rsp),%rsp
        leaq    32(%r11),%r11
        decl    %ebx
        jne     .Lloop10

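        # Put the scalar low digit (r9) into lane 0 of the vector accumulator.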
        vmovq   %r9,%xmm0
        vpbroadcastq    %xmm0,%ymm0
        vpblendd        $3,%ymm0,%ymm3,%ymm3

        leaq    -640(%rsp),%rsp
        vmovupd %ymm3,0(%rsp)
        vmovupd %ymm4,32(%rsp)
        vmovupd %ymm5,64(%rsp)
        vmovupd %ymm6,96(%rsp)
        vmovupd %ymm7,128(%rsp)
        vmovupd %ymm8,160(%rsp)
        vmovupd %ymm9,192(%rsp)
        vmovupd %ymm10,224(%rsp)
        vmovupd %ymm11,256(%rsp)
        vmovupd %ymm12,288(%rsp)



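        # Normalize the redundant digits: each 64-bit lane holds a 52-bit
        # digit plus carry bits.  Extract the carries (>> 52) here; the
        # masked digits are re-read from the stack copy below and the two
        # parts are recombined by the vpaddq block.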
        vpsrlq  $52,%ymm3,%ymm3
        vpsrlq  $52,%ymm4,%ymm4
        vpsrlq  $52,%ymm5,%ymm5
        vpsrlq  $52,%ymm6,%ymm6
        vpsrlq  $52,%ymm7,%ymm7
        vpsrlq  $52,%ymm8,%ymm8
        vpsrlq  $52,%ymm9,%ymm9
        vpsrlq  $52,%ymm10,%ymm10
        vpsrlq  $52,%ymm11,%ymm11
        vpsrlq  $52,%ymm12,%ymm12


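        # Shift the carry vector up by one digit (one 64-bit lane) across
        # the ten registers: rotate each register (vpermq $144) and pull the
        # top lane of its lower neighbour into lane 0 (vpermq $3 + blend).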
        vpermq  $144,%ymm12,%ymm12
        vpermq  $3,%ymm11,%ymm13
        vblendpd        $1,%ymm13,%ymm12,%ymm12

        vpermq  $144,%ymm11,%ymm11
        vpermq  $3,%ymm10,%ymm13
        vblendpd        $1,%ymm13,%ymm11,%ymm11

        vpermq  $144,%ymm10,%ymm10
        vpermq  $3,%ymm9,%ymm13
        vblendpd        $1,%ymm13,%ymm10,%ymm10

        vpermq  $144,%ymm9,%ymm9
        vpermq  $3,%ymm8,%ymm13
        vblendpd        $1,%ymm13,%ymm9,%ymm9

        vpermq  $144,%ymm8,%ymm8
        vpermq  $3,%ymm7,%ymm13
        vblendpd        $1,%ymm13,%ymm8,%ymm8

        vpermq  $144,%ymm7,%ymm7
        vpermq  $3,%ymm6,%ymm13
        vblendpd        $1,%ymm13,%ymm7,%ymm7

        vpermq  $144,%ymm6,%ymm6
        vpermq  $3,%ymm5,%ymm13
        vblendpd        $1,%ymm13,%ymm6,%ymm6

        vpermq  $144,%ymm5,%ymm5
        vpermq  $3,%ymm4,%ymm13
        vblendpd        $1,%ymm13,%ymm5,%ymm5

        vpermq  $144,%ymm4,%ymm4
        vpermq  $3,%ymm3,%ymm13
        vblendpd        $1,%ymm13,%ymm4,%ymm4

        vpermq  $144,%ymm3,%ymm3
        vpand   .Lhigh64x3(%rip),%ymm3,%ymm3

        vmovupd %ymm3,320(%rsp)
        vmovupd %ymm4,352(%rsp)
        vmovupd %ymm5,384(%rsp)
        vmovupd %ymm6,416(%rsp)
        vmovupd %ymm7,448(%rsp)
        vmovupd %ymm8,480(%rsp)
        vmovupd %ymm9,512(%rsp)
        vmovupd %ymm10,544(%rsp)
        vmovupd %ymm11,576(%rsp)
        vmovupd %ymm12,608(%rsp)

        vmovupd 0(%rsp),%ymm3
        vmovupd 32(%rsp),%ymm4
        vmovupd 64(%rsp),%ymm5
        vmovupd 96(%rsp),%ymm6
        vmovupd 128(%rsp),%ymm7
        vmovupd 160(%rsp),%ymm8
        vmovupd 192(%rsp),%ymm9
        vmovupd 224(%rsp),%ymm10
        vmovupd 256(%rsp),%ymm11
        vmovupd 288(%rsp),%ymm12


        vpand   .Lmask52x4(%rip),%ymm3,%ymm3
        vpand   .Lmask52x4(%rip),%ymm4,%ymm4
        vpand   .Lmask52x4(%rip),%ymm5,%ymm5
        vpand   .Lmask52x4(%rip),%ymm6,%ymm6
        vpand   .Lmask52x4(%rip),%ymm7,%ymm7
        vpand   .Lmask52x4(%rip),%ymm8,%ymm8
        vpand   .Lmask52x4(%rip),%ymm9,%ymm9
        vpand   .Lmask52x4(%rip),%ymm10,%ymm10
        vpand   .Lmask52x4(%rip),%ymm11,%ymm11
        vpand   .Lmask52x4(%rip),%ymm12,%ymm12


        vpaddq  320(%rsp),%ymm3,%ymm3
        vpaddq  352(%rsp),%ymm4,%ymm4
        vpaddq  384(%rsp),%ymm5,%ymm5
        vpaddq  416(%rsp),%ymm6,%ymm6
        vpaddq  448(%rsp),%ymm7,%ymm7
        vpaddq  480(%rsp),%ymm8,%ymm8
        vpaddq  512(%rsp),%ymm9,%ymm9
        vpaddq  544(%rsp),%ymm10,%ymm10
        vpaddq  576(%rsp),%ymm11,%ymm11
        vpaddq  608(%rsp),%ymm12,%ymm12

        leaq    640(%rsp),%rsp



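        # Digits can still exceed 52 bits by one unit after the addition
        # above.  Build two 40-bit masks, 4 bits per vmovmskpd, packed into
        # five byte registers: gt = digit > 2^52-1, eq = digit == 2^52-1.
        # The addb/adcb chains compute carry propagation as (gt<<1) + eq,
        # and xor with eq leaves set bits exactly where a digit must absorb
        # an incoming carry.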
        vpcmpgtq        .Lmask52x4(%rip),%ymm3,%ymm13
        vmovmskpd       %ymm13,%r14d
        vpcmpgtq        .Lmask52x4(%rip),%ymm4,%ymm13
        vmovmskpd       %ymm13,%r13d
        shlb    $4,%r13b
        orb     %r13b,%r14b

        vpcmpgtq        .Lmask52x4(%rip),%ymm5,%ymm13
        vmovmskpd       %ymm13,%r13d
        vpcmpgtq        .Lmask52x4(%rip),%ymm6,%ymm13
        vmovmskpd       %ymm13,%r12d
        shlb    $4,%r12b
        orb     %r12b,%r13b

        vpcmpgtq        .Lmask52x4(%rip),%ymm7,%ymm13
        vmovmskpd       %ymm13,%r12d
        vpcmpgtq        .Lmask52x4(%rip),%ymm8,%ymm13
        vmovmskpd       %ymm13,%r11d
        shlb    $4,%r11b
        orb     %r11b,%r12b

        vpcmpgtq        .Lmask52x4(%rip),%ymm9,%ymm13
        vmovmskpd       %ymm13,%r11d
        vpcmpgtq        .Lmask52x4(%rip),%ymm10,%ymm13
        vmovmskpd       %ymm13,%r10d
        shlb    $4,%r10b
        orb     %r10b,%r11b

        vpcmpgtq        .Lmask52x4(%rip),%ymm11,%ymm13
        vmovmskpd       %ymm13,%r10d
        vpcmpgtq        .Lmask52x4(%rip),%ymm12,%ymm13
        vmovmskpd       %ymm13,%r9d
        shlb    $4,%r9b
        orb     %r9b,%r10b

        addb    %r14b,%r14b
        adcb    %r13b,%r13b
        adcb    %r12b,%r12b
        adcb    %r11b,%r11b
        adcb    %r10b,%r10b


        vpcmpeqq        .Lmask52x4(%rip),%ymm3,%ymm13
        vmovmskpd       %ymm13,%r9d
        vpcmpeqq        .Lmask52x4(%rip),%ymm4,%ymm13
        vmovmskpd       %ymm13,%r8d
        shlb    $4,%r8b
        orb     %r8b,%r9b

        vpcmpeqq        .Lmask52x4(%rip),%ymm5,%ymm13
        vmovmskpd       %ymm13,%r8d
        vpcmpeqq        .Lmask52x4(%rip),%ymm6,%ymm13
        vmovmskpd       %ymm13,%edx
        shlb    $4,%dl
        orb     %dl,%r8b

        vpcmpeqq        .Lmask52x4(%rip),%ymm7,%ymm13
        vmovmskpd       %ymm13,%edx
        vpcmpeqq        .Lmask52x4(%rip),%ymm8,%ymm13
        vmovmskpd       %ymm13,%ecx
        shlb    $4,%cl
        orb     %cl,%dl

        vpcmpeqq        .Lmask52x4(%rip),%ymm9,%ymm13
        vmovmskpd       %ymm13,%ecx
        vpcmpeqq        .Lmask52x4(%rip),%ymm10,%ymm13
        vmovmskpd       %ymm13,%ebx
        shlb    $4,%bl
        orb     %bl,%cl

        vpcmpeqq        .Lmask52x4(%rip),%ymm11,%ymm13
        vmovmskpd       %ymm13,%ebx
        vpcmpeqq        .Lmask52x4(%rip),%ymm12,%ymm13
        vmovmskpd       %ymm13,%eax
        shlb    $4,%al
        orb     %al,%bl

        addb    %r9b,%r14b
        adcb    %r8b,%r13b
        adcb    %dl,%r12b
        adcb    %cl,%r11b
        adcb    %bl,%r10b

        xorb    %r9b,%r14b
        xorb    %r8b,%r13b
        xorb    %dl,%r12b
        xorb    %cl,%r11b
        xorb    %bl,%r10b

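        # Apply the corrections: each 4-bit nibble of the carry mask selects
        # a 32-byte entry of .Lkmasklut, and vblendvpd picks, per digit,
        # either the original value or value - (2^52-1), i.e. +1 with the
        # overflow dropped by the final 52-bit masking.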
        pushq   %r9
        pushq   %r8

        leaq    .Lkmasklut(%rip),%r8

        movb    %r14b,%r9b
        andq    $0xf,%r14
        vpsubq  .Lmask52x4(%rip),%ymm3,%ymm13
        shlq    $5,%r14
        vmovapd (%r8,%r14,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm3,%ymm3

        shrb    $4,%r9b
        andq    $0xf,%r9
        vpsubq  .Lmask52x4(%rip),%ymm4,%ymm13
        shlq    $5,%r9
        vmovapd (%r8,%r9,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm4,%ymm4

        movb    %r13b,%r9b
        andq    $0xf,%r13
        vpsubq  .Lmask52x4(%rip),%ymm5,%ymm13
        shlq    $5,%r13
        vmovapd (%r8,%r13,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm5,%ymm5

        shrb    $4,%r9b
        andq    $0xf,%r9
        vpsubq  .Lmask52x4(%rip),%ymm6,%ymm13
        shlq    $5,%r9
        vmovapd (%r8,%r9,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm6,%ymm6

        movb    %r12b,%r9b
        andq    $0xf,%r12
        vpsubq  .Lmask52x4(%rip),%ymm7,%ymm13
        shlq    $5,%r12
        vmovapd (%r8,%r12,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm7,%ymm7

        shrb    $4,%r9b
        andq    $0xf,%r9
        vpsubq  .Lmask52x4(%rip),%ymm8,%ymm13
        shlq    $5,%r9
        vmovapd (%r8,%r9,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm8,%ymm8

        movb    %r11b,%r9b
        andq    $0xf,%r11
        vpsubq  .Lmask52x4(%rip),%ymm9,%ymm13
        shlq    $5,%r11
        vmovapd (%r8,%r11,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm9,%ymm9

        shrb    $4,%r9b
        andq    $0xf,%r9
        vpsubq  .Lmask52x4(%rip),%ymm10,%ymm13
        shlq    $5,%r9
        vmovapd (%r8,%r9,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm10,%ymm10

        movb    %r10b,%r9b
        andq    $0xf,%r10
        vpsubq  .Lmask52x4(%rip),%ymm11,%ymm13
        shlq    $5,%r10
        vmovapd (%r8,%r10,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm11,%ymm11

        shrb    $4,%r9b
        andq    $0xf,%r9
        vpsubq  .Lmask52x4(%rip),%ymm12,%ymm13
        shlq    $5,%r9
        vmovapd (%r8,%r9,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm12,%ymm12

        popq    %r8
        popq    %r9

        vpand   .Lmask52x4(%rip),%ymm3,%ymm3
        vpand   .Lmask52x4(%rip),%ymm4,%ymm4
        vpand   .Lmask52x4(%rip),%ymm5,%ymm5
        vpand   .Lmask52x4(%rip),%ymm6,%ymm6
        vpand   .Lmask52x4(%rip),%ymm7,%ymm7
        vpand   .Lmask52x4(%rip),%ymm8,%ymm8
        vpand   .Lmask52x4(%rip),%ymm9,%ymm9

        vpand   .Lmask52x4(%rip),%ymm10,%ymm10
        vpand   .Lmask52x4(%rip),%ymm11,%ymm11
        vpand   .Lmask52x4(%rip),%ymm12,%ymm12

        vmovdqu %ymm3,0(%rdi)
        vmovdqu %ymm4,32(%rdi)
        vmovdqu %ymm5,64(%rdi)
        vmovdqu %ymm6,96(%rdi)
        vmovdqu %ymm7,128(%rdi)
        vmovdqu %ymm8,160(%rdi)
        vmovdqu %ymm9,192(%rdi)
        vmovdqu %ymm10,224(%rdi)
        vmovdqu %ymm11,256(%rdi)
        vmovdqu %ymm12,288(%rdi)

        vzeroupper
        leaq    (%rsp),%rax
.cfi_def_cfa_register   %rax
        movq    0(%rax),%r15
.cfi_restore    %r15
        movq    8(%rax),%r14
.cfi_restore    %r14
        movq    16(%rax),%r13
.cfi_restore    %r13
        movq    24(%rax),%r12
.cfi_restore    %r12
        movq    32(%rax),%rbp
.cfi_restore    %rbp
        movq    40(%rax),%rbx
.cfi_restore    %rbx
        leaq    48(%rax),%rsp
.cfi_def_cfa    %rsp,8
.Lossl_rsaz_amm52x40_x1_avxifma256_epilogue:

        .byte   0xf3,0xc3
.cfi_endproc
.size   ossl_rsaz_amm52x40_x1_avxifma256, .-ossl_rsaz_amm52x40_x1_avxifma256
.section        .rodata
.align  32
.Lmask52x4:
.quad   0xfffffffffffff
.quad   0xfffffffffffff
.quad   0xfffffffffffff
.quad   0xfffffffffffff
.Lhigh64x3:
.quad   0x0
.quad   0xffffffffffffffff
.quad   0xffffffffffffffff
.quad   0xffffffffffffffff
.Lkmasklut:
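# 16 entries of 4 qwords: entry i has qword j set to all-ones iff bit j
# of i is set, turning a 4-bit nibble into a vblendvpd lane mask.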

.quad   0x0
.quad   0x0
.quad   0x0
.quad   0x0

.quad   0xffffffffffffffff
.quad   0x0
.quad   0x0
.quad   0x0

.quad   0x0
.quad   0xffffffffffffffff
.quad   0x0
.quad   0x0

.quad   0xffffffffffffffff
.quad   0xffffffffffffffff
.quad   0x0
.quad   0x0

.quad   0x0
.quad   0x0
.quad   0xffffffffffffffff
.quad   0x0

.quad   0xffffffffffffffff
.quad   0x0
.quad   0xffffffffffffffff
.quad   0x0

.quad   0x0
.quad   0xffffffffffffffff
.quad   0xffffffffffffffff
.quad   0x0

.quad   0xffffffffffffffff
.quad   0xffffffffffffffff
.quad   0xffffffffffffffff
.quad   0x0

.quad   0x0
.quad   0x0
.quad   0x0
.quad   0xffffffffffffffff

.quad   0xffffffffffffffff
.quad   0x0
.quad   0x0
.quad   0xffffffffffffffff

.quad   0x0
.quad   0xffffffffffffffff
.quad   0x0
.quad   0xffffffffffffffff

.quad   0xffffffffffffffff
.quad   0xffffffffffffffff
.quad   0x0
.quad   0xffffffffffffffff

.quad   0x0
.quad   0x0
.quad   0xffffffffffffffff
.quad   0xffffffffffffffff

.quad   0xffffffffffffffff
.quad   0x0
.quad   0xffffffffffffffff
.quad   0xffffffffffffffff

.quad   0x0
.quad   0xffffffffffffffff
.quad   0xffffffffffffffff
.quad   0xffffffffffffffff

.quad   0xffffffffffffffff
.quad   0xffffffffffffffff
.quad   0xffffffffffffffff
.quad   0xffffffffffffffff
.text

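# ossl_rsaz_amm52x40_x2_avxifma256: as the x1 variant, but performs two
# independent almost Montgomery multiplications back to back (presumably
# the two CRT halves).  The operand arrays hold both 40-digit halves,
# the second at byte offset 320, and r8 points at two k0 values.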
.globl  ossl_rsaz_amm52x40_x2_avxifma256
.type   ossl_rsaz_amm52x40_x2_avxifma256,@function
.align  32
ossl_rsaz_amm52x40_x2_avxifma256:
.cfi_startproc
.byte   243,15,30,250
        pushq   %rbx
.cfi_adjust_cfa_offset  8
.cfi_offset     %rbx,-16
        pushq   %rbp
.cfi_adjust_cfa_offset  8
.cfi_offset     %rbp,-24
        pushq   %r12
.cfi_adjust_cfa_offset  8
.cfi_offset     %r12,-32
        pushq   %r13
.cfi_adjust_cfa_offset  8
.cfi_offset     %r13,-40
        pushq   %r14
.cfi_adjust_cfa_offset  8
.cfi_offset     %r14,-48
        pushq   %r15
.cfi_adjust_cfa_offset  8
.cfi_offset     %r15,-56

        vpxor   %ymm0,%ymm0,%ymm0
        vmovapd %ymm0,%ymm3
        vmovapd %ymm0,%ymm4
        vmovapd %ymm0,%ymm5
        vmovapd %ymm0,%ymm6
        vmovapd %ymm0,%ymm7
        vmovapd %ymm0,%ymm8
        vmovapd %ymm0,%ymm9
        vmovapd %ymm0,%ymm10
        vmovapd %ymm0,%ymm11
        vmovapd %ymm0,%ymm12

        xorl    %r9d,%r9d

        movq    %rdx,%r11
        movq    $0xfffffffffffff,%rax

        movl    $40,%ebx

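# First multiplication: the same digit step as in the x1 variant, but
# one word of b per iteration (40 iterations, no 4x unrolling).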
.align  32
.Lloop40:
        movq    0(%r11),%r13

        vpbroadcastq    0(%r11),%ymm1
        movq    0(%rsi),%rdx
        mulxq   %r13,%r13,%r12
        addq    %r13,%r9
        movq    %r12,%r10
        adcq    $0,%r10

        movq    (%r8),%r13
        imulq   %r9,%r13
        andq    %rax,%r13

        vmovq   %r13,%xmm2
        vpbroadcastq    %xmm2,%ymm2
        movq    0(%rcx),%rdx
        mulxq   %r13,%r13,%r12
        addq    %r13,%r9
        adcq    %r12,%r10

        shrq    $52,%r9
        salq    $12,%r10
        orq     %r10,%r9

        leaq    -328(%rsp),%rsp

{vex}   vpmadd52luq     0(%rsi),%ymm1,%ymm3
{vex}   vpmadd52luq     32(%rsi),%ymm1,%ymm4
{vex}   vpmadd52luq     64(%rsi),%ymm1,%ymm5
{vex}   vpmadd52luq     96(%rsi),%ymm1,%ymm6
{vex}   vpmadd52luq     128(%rsi),%ymm1,%ymm7
{vex}   vpmadd52luq     160(%rsi),%ymm1,%ymm8
{vex}   vpmadd52luq     192(%rsi),%ymm1,%ymm9
{vex}   vpmadd52luq     224(%rsi),%ymm1,%ymm10
{vex}   vpmadd52luq     256(%rsi),%ymm1,%ymm11
{vex}   vpmadd52luq     288(%rsi),%ymm1,%ymm12

{vex}   vpmadd52luq     0(%rcx),%ymm2,%ymm3
{vex}   vpmadd52luq     32(%rcx),%ymm2,%ymm4
{vex}   vpmadd52luq     64(%rcx),%ymm2,%ymm5
{vex}   vpmadd52luq     96(%rcx),%ymm2,%ymm6
{vex}   vpmadd52luq     128(%rcx),%ymm2,%ymm7
{vex}   vpmadd52luq     160(%rcx),%ymm2,%ymm8
{vex}   vpmadd52luq     192(%rcx),%ymm2,%ymm9
{vex}   vpmadd52luq     224(%rcx),%ymm2,%ymm10
{vex}   vpmadd52luq     256(%rcx),%ymm2,%ymm11
{vex}   vpmadd52luq     288(%rcx),%ymm2,%ymm12
        vmovdqu %ymm3,0(%rsp)
        vmovdqu %ymm4,32(%rsp)
        vmovdqu %ymm5,64(%rsp)
        vmovdqu %ymm6,96(%rsp)
        vmovdqu %ymm7,128(%rsp)
        vmovdqu %ymm8,160(%rsp)
        vmovdqu %ymm9,192(%rsp)
        vmovdqu %ymm10,224(%rsp)
        vmovdqu %ymm11,256(%rsp)
        vmovdqu %ymm12,288(%rsp)
        movq    $0,320(%rsp)

        vmovdqu 8(%rsp),%ymm3
        vmovdqu 40(%rsp),%ymm4
        vmovdqu 72(%rsp),%ymm5
        vmovdqu 104(%rsp),%ymm6
        vmovdqu 136(%rsp),%ymm7
        vmovdqu 168(%rsp),%ymm8
        vmovdqu 200(%rsp),%ymm9
        vmovdqu 232(%rsp),%ymm10
        vmovdqu 264(%rsp),%ymm11
        vmovdqu 296(%rsp),%ymm12

        addq    8(%rsp),%r9

{vex}   vpmadd52huq     0(%rsi),%ymm1,%ymm3
{vex}   vpmadd52huq     32(%rsi),%ymm1,%ymm4
{vex}   vpmadd52huq     64(%rsi),%ymm1,%ymm5
{vex}   vpmadd52huq     96(%rsi),%ymm1,%ymm6
{vex}   vpmadd52huq     128(%rsi),%ymm1,%ymm7
{vex}   vpmadd52huq     160(%rsi),%ymm1,%ymm8
{vex}   vpmadd52huq     192(%rsi),%ymm1,%ymm9
{vex}   vpmadd52huq     224(%rsi),%ymm1,%ymm10
{vex}   vpmadd52huq     256(%rsi),%ymm1,%ymm11
{vex}   vpmadd52huq     288(%rsi),%ymm1,%ymm12

{vex}   vpmadd52huq     0(%rcx),%ymm2,%ymm3
{vex}   vpmadd52huq     32(%rcx),%ymm2,%ymm4
{vex}   vpmadd52huq     64(%rcx),%ymm2,%ymm5
{vex}   vpmadd52huq     96(%rcx),%ymm2,%ymm6
{vex}   vpmadd52huq     128(%rcx),%ymm2,%ymm7
{vex}   vpmadd52huq     160(%rcx),%ymm2,%ymm8
{vex}   vpmadd52huq     192(%rcx),%ymm2,%ymm9
{vex}   vpmadd52huq     224(%rcx),%ymm2,%ymm10
{vex}   vpmadd52huq     256(%rcx),%ymm2,%ymm11
{vex}   vpmadd52huq     288(%rcx),%ymm2,%ymm12
        leaq    328(%rsp),%rsp
        leaq    8(%r11),%r11
        decl    %ebx
        jne     .Lloop40

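        # Save the operand pointers around the normalization, which clobbers
        # several of them; r11 already points at the second half of b (b+320).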
        pushq   %r11
        pushq   %rsi
        pushq   %rcx
        pushq   %r8

        vmovq   %r9,%xmm0
        vpbroadcastq    %xmm0,%ymm0
        vpblendd        $3,%ymm0,%ymm3,%ymm3

        leaq    -640(%rsp),%rsp
        vmovupd %ymm3,0(%rsp)
        vmovupd %ymm4,32(%rsp)
        vmovupd %ymm5,64(%rsp)
        vmovupd %ymm6,96(%rsp)
        vmovupd %ymm7,128(%rsp)
        vmovupd %ymm8,160(%rsp)
        vmovupd %ymm9,192(%rsp)
        vmovupd %ymm10,224(%rsp)
        vmovupd %ymm11,256(%rsp)
        vmovupd %ymm12,288(%rsp)



        vpsrlq  $52,%ymm3,%ymm3
        vpsrlq  $52,%ymm4,%ymm4
        vpsrlq  $52,%ymm5,%ymm5
        vpsrlq  $52,%ymm6,%ymm6
        vpsrlq  $52,%ymm7,%ymm7
        vpsrlq  $52,%ymm8,%ymm8
        vpsrlq  $52,%ymm9,%ymm9
        vpsrlq  $52,%ymm10,%ymm10
        vpsrlq  $52,%ymm11,%ymm11
        vpsrlq  $52,%ymm12,%ymm12


        vpermq  $144,%ymm12,%ymm12
        vpermq  $3,%ymm11,%ymm13
        vblendpd        $1,%ymm13,%ymm12,%ymm12

        vpermq  $144,%ymm11,%ymm11
        vpermq  $3,%ymm10,%ymm13
        vblendpd        $1,%ymm13,%ymm11,%ymm11

        vpermq  $144,%ymm10,%ymm10
        vpermq  $3,%ymm9,%ymm13
        vblendpd        $1,%ymm13,%ymm10,%ymm10

        vpermq  $144,%ymm9,%ymm9
        vpermq  $3,%ymm8,%ymm13
        vblendpd        $1,%ymm13,%ymm9,%ymm9

        vpermq  $144,%ymm8,%ymm8
        vpermq  $3,%ymm7,%ymm13
        vblendpd        $1,%ymm13,%ymm8,%ymm8

        vpermq  $144,%ymm7,%ymm7
        vpermq  $3,%ymm6,%ymm13
        vblendpd        $1,%ymm13,%ymm7,%ymm7

        vpermq  $144,%ymm6,%ymm6
        vpermq  $3,%ymm5,%ymm13
        vblendpd        $1,%ymm13,%ymm6,%ymm6

        vpermq  $144,%ymm5,%ymm5
        vpermq  $3,%ymm4,%ymm13
        vblendpd        $1,%ymm13,%ymm5,%ymm5

        vpermq  $144,%ymm4,%ymm4
        vpermq  $3,%ymm3,%ymm13
        vblendpd        $1,%ymm13,%ymm4,%ymm4

        vpermq  $144,%ymm3,%ymm3
        vpand   .Lhigh64x3(%rip),%ymm3,%ymm3

        vmovupd %ymm3,320(%rsp)
        vmovupd %ymm4,352(%rsp)
        vmovupd %ymm5,384(%rsp)
        vmovupd %ymm6,416(%rsp)
        vmovupd %ymm7,448(%rsp)
        vmovupd %ymm8,480(%rsp)
        vmovupd %ymm9,512(%rsp)
        vmovupd %ymm10,544(%rsp)
        vmovupd %ymm11,576(%rsp)
        vmovupd %ymm12,608(%rsp)

        vmovupd 0(%rsp),%ymm3
        vmovupd 32(%rsp),%ymm4
        vmovupd 64(%rsp),%ymm5
        vmovupd 96(%rsp),%ymm6
        vmovupd 128(%rsp),%ymm7
        vmovupd 160(%rsp),%ymm8
        vmovupd 192(%rsp),%ymm9
        vmovupd 224(%rsp),%ymm10
        vmovupd 256(%rsp),%ymm11
        vmovupd 288(%rsp),%ymm12


        vpand   .Lmask52x4(%rip),%ymm3,%ymm3
        vpand   .Lmask52x4(%rip),%ymm4,%ymm4
        vpand   .Lmask52x4(%rip),%ymm5,%ymm5
        vpand   .Lmask52x4(%rip),%ymm6,%ymm6
        vpand   .Lmask52x4(%rip),%ymm7,%ymm7
        vpand   .Lmask52x4(%rip),%ymm8,%ymm8
        vpand   .Lmask52x4(%rip),%ymm9,%ymm9
        vpand   .Lmask52x4(%rip),%ymm10,%ymm10
        vpand   .Lmask52x4(%rip),%ymm11,%ymm11
        vpand   .Lmask52x4(%rip),%ymm12,%ymm12


        vpaddq  320(%rsp),%ymm3,%ymm3
        vpaddq  352(%rsp),%ymm4,%ymm4
        vpaddq  384(%rsp),%ymm5,%ymm5
        vpaddq  416(%rsp),%ymm6,%ymm6
        vpaddq  448(%rsp),%ymm7,%ymm7
        vpaddq  480(%rsp),%ymm8,%ymm8
        vpaddq  512(%rsp),%ymm9,%ymm9
        vpaddq  544(%rsp),%ymm10,%ymm10
        vpaddq  576(%rsp),%ymm11,%ymm11
        vpaddq  608(%rsp),%ymm12,%ymm12

        leaq    640(%rsp),%rsp



        vpcmpgtq        .Lmask52x4(%rip),%ymm3,%ymm13
        vmovmskpd       %ymm13,%r14d
        vpcmpgtq        .Lmask52x4(%rip),%ymm4,%ymm13
        vmovmskpd       %ymm13,%r13d
        shlb    $4,%r13b
        orb     %r13b,%r14b

        vpcmpgtq        .Lmask52x4(%rip),%ymm5,%ymm13
        vmovmskpd       %ymm13,%r13d
        vpcmpgtq        .Lmask52x4(%rip),%ymm6,%ymm13
        vmovmskpd       %ymm13,%r12d
        shlb    $4,%r12b
        orb     %r12b,%r13b

        vpcmpgtq        .Lmask52x4(%rip),%ymm7,%ymm13
        vmovmskpd       %ymm13,%r12d
        vpcmpgtq        .Lmask52x4(%rip),%ymm8,%ymm13
        vmovmskpd       %ymm13,%r11d
        shlb    $4,%r11b
        orb     %r11b,%r12b

        vpcmpgtq        .Lmask52x4(%rip),%ymm9,%ymm13
        vmovmskpd       %ymm13,%r11d
        vpcmpgtq        .Lmask52x4(%rip),%ymm10,%ymm13
        vmovmskpd       %ymm13,%r10d
        shlb    $4,%r10b
        orb     %r10b,%r11b

        vpcmpgtq        .Lmask52x4(%rip),%ymm11,%ymm13
        vmovmskpd       %ymm13,%r10d
        vpcmpgtq        .Lmask52x4(%rip),%ymm12,%ymm13
        vmovmskpd       %ymm13,%r9d
        shlb    $4,%r9b
        orb     %r9b,%r10b

        addb    %r14b,%r14b
        adcb    %r13b,%r13b
        adcb    %r12b,%r12b
        adcb    %r11b,%r11b
        adcb    %r10b,%r10b


        vpcmpeqq        .Lmask52x4(%rip),%ymm3,%ymm13
        vmovmskpd       %ymm13,%r9d
        vpcmpeqq        .Lmask52x4(%rip),%ymm4,%ymm13
        vmovmskpd       %ymm13,%r8d
        shlb    $4,%r8b
        orb     %r8b,%r9b

        vpcmpeqq        .Lmask52x4(%rip),%ymm5,%ymm13
        vmovmskpd       %ymm13,%r8d
        vpcmpeqq        .Lmask52x4(%rip),%ymm6,%ymm13
        vmovmskpd       %ymm13,%edx
        shlb    $4,%dl
        orb     %dl,%r8b

        vpcmpeqq        .Lmask52x4(%rip),%ymm7,%ymm13
        vmovmskpd       %ymm13,%edx
        vpcmpeqq        .Lmask52x4(%rip),%ymm8,%ymm13
        vmovmskpd       %ymm13,%ecx
        shlb    $4,%cl
        orb     %cl,%dl

        vpcmpeqq        .Lmask52x4(%rip),%ymm9,%ymm13
        vmovmskpd       %ymm13,%ecx
        vpcmpeqq        .Lmask52x4(%rip),%ymm10,%ymm13
        vmovmskpd       %ymm13,%ebx
        shlb    $4,%bl
        orb     %bl,%cl

        vpcmpeqq        .Lmask52x4(%rip),%ymm11,%ymm13
        vmovmskpd       %ymm13,%ebx
        vpcmpeqq        .Lmask52x4(%rip),%ymm12,%ymm13
        vmovmskpd       %ymm13,%eax
        shlb    $4,%al
        orb     %al,%bl

        addb    %r9b,%r14b
        adcb    %r8b,%r13b
        adcb    %dl,%r12b
        adcb    %cl,%r11b
        adcb    %bl,%r10b

        xorb    %r9b,%r14b
        xorb    %r8b,%r13b
        xorb    %dl,%r12b
        xorb    %cl,%r11b
        xorb    %bl,%r10b

        pushq   %r9
        pushq   %r8

        leaq    .Lkmasklut(%rip),%r8

        movb    %r14b,%r9b
        andq    $0xf,%r14
        vpsubq  .Lmask52x4(%rip),%ymm3,%ymm13
        shlq    $5,%r14
        vmovapd (%r8,%r14,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm3,%ymm3

        shrb    $4,%r9b
        andq    $0xf,%r9
        vpsubq  .Lmask52x4(%rip),%ymm4,%ymm13
        shlq    $5,%r9
        vmovapd (%r8,%r9,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm4,%ymm4

        movb    %r13b,%r9b
        andq    $0xf,%r13
        vpsubq  .Lmask52x4(%rip),%ymm5,%ymm13
        shlq    $5,%r13
        vmovapd (%r8,%r13,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm5,%ymm5

        shrb    $4,%r9b
        andq    $0xf,%r9
        vpsubq  .Lmask52x4(%rip),%ymm6,%ymm13
        shlq    $5,%r9
        vmovapd (%r8,%r9,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm6,%ymm6

        movb    %r12b,%r9b
        andq    $0xf,%r12
        vpsubq  .Lmask52x4(%rip),%ymm7,%ymm13
        shlq    $5,%r12
        vmovapd (%r8,%r12,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm7,%ymm7

        shrb    $4,%r9b
        andq    $0xf,%r9
        vpsubq  .Lmask52x4(%rip),%ymm8,%ymm13
        shlq    $5,%r9
        vmovapd (%r8,%r9,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm8,%ymm8

        movb    %r11b,%r9b
        andq    $0xf,%r11
        vpsubq  .Lmask52x4(%rip),%ymm9,%ymm13
        shlq    $5,%r11
        vmovapd (%r8,%r11,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm9,%ymm9

        shrb    $4,%r9b
        andq    $0xf,%r9
        vpsubq  .Lmask52x4(%rip),%ymm10,%ymm13
        shlq    $5,%r9
        vmovapd (%r8,%r9,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm10,%ymm10

        movb    %r10b,%r9b
        andq    $0xf,%r10
        vpsubq  .Lmask52x4(%rip),%ymm11,%ymm13
        shlq    $5,%r10
        vmovapd (%r8,%r10,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm11,%ymm11

        shrb    $4,%r9b
        andq    $0xf,%r9
        vpsubq  .Lmask52x4(%rip),%ymm12,%ymm13
        shlq    $5,%r9
        vmovapd (%r8,%r9,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm12,%ymm12

        popq    %r8
        popq    %r9

        vpand   .Lmask52x4(%rip),%ymm3,%ymm3
        vpand   .Lmask52x4(%rip),%ymm4,%ymm4
        vpand   .Lmask52x4(%rip),%ymm5,%ymm5
        vpand   .Lmask52x4(%rip),%ymm6,%ymm6
        vpand   .Lmask52x4(%rip),%ymm7,%ymm7
        vpand   .Lmask52x4(%rip),%ymm8,%ymm8
        vpand   .Lmask52x4(%rip),%ymm9,%ymm9

        vpand   .Lmask52x4(%rip),%ymm10,%ymm10
        vpand   .Lmask52x4(%rip),%ymm11,%ymm11
        vpand   .Lmask52x4(%rip),%ymm12,%ymm12

        popq    %r8
        popq    %rcx
        popq    %rsi
        popq    %r11

        vmovdqu %ymm3,0(%rdi)
        vmovdqu %ymm4,32(%rdi)
        vmovdqu %ymm5,64(%rdi)
        vmovdqu %ymm6,96(%rdi)
        vmovdqu %ymm7,128(%rdi)
        vmovdqu %ymm8,160(%rdi)
        vmovdqu %ymm9,192(%rdi)
        vmovdqu %ymm10,224(%rdi)
        vmovdqu %ymm11,256(%rdi)
        vmovdqu %ymm12,288(%rdi)

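        # Second multiplication: reset the low-digit accumulator and loop
        # state; r11 (restored above) already points at the second half of b.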
        xorl    %r9d,%r9d

        movq    $0xfffffffffffff,%rax

        movl    $40,%ebx

        vpxor   %ymm0,%ymm0,%ymm0
        vmovapd %ymm0,%ymm3
        vmovapd %ymm0,%ymm4
        vmovapd %ymm0,%ymm5
        vmovapd %ymm0,%ymm6
        vmovapd %ymm0,%ymm7
        vmovapd %ymm0,%ymm8
        vmovapd %ymm0,%ymm9
        vmovapd %ymm0,%ymm10
        vmovapd %ymm0,%ymm11
        vmovapd %ymm0,%ymm12
.align  32
.Lloop40_1:
        movq    0(%r11),%r13

        vpbroadcastq    0(%r11),%ymm1
        movq    320(%rsi),%rdx
        mulxq   %r13,%r13,%r12
        addq    %r13,%r9
        movq    %r12,%r10
        adcq    $0,%r10

        movq    8(%r8),%r13
        imulq   %r9,%r13
        andq    %rax,%r13

        vmovq   %r13,%xmm2
        vpbroadcastq    %xmm2,%ymm2
        movq    320(%rcx),%rdx
        mulxq   %r13,%r13,%r12
        addq    %r13,%r9
        adcq    %r12,%r10

        shrq    $52,%r9
        salq    $12,%r10
        orq     %r10,%r9

        leaq    -328(%rsp),%rsp

{vex}   vpmadd52luq     320(%rsi),%ymm1,%ymm3
{vex}   vpmadd52luq     352(%rsi),%ymm1,%ymm4
{vex}   vpmadd52luq     384(%rsi),%ymm1,%ymm5
{vex}   vpmadd52luq     416(%rsi),%ymm1,%ymm6
{vex}   vpmadd52luq     448(%rsi),%ymm1,%ymm7
{vex}   vpmadd52luq     480(%rsi),%ymm1,%ymm8
{vex}   vpmadd52luq     512(%rsi),%ymm1,%ymm9
{vex}   vpmadd52luq     544(%rsi),%ymm1,%ymm10
{vex}   vpmadd52luq     576(%rsi),%ymm1,%ymm11
{vex}   vpmadd52luq     608(%rsi),%ymm1,%ymm12

{vex}   vpmadd52luq     320(%rcx),%ymm2,%ymm3
{vex}   vpmadd52luq     352(%rcx),%ymm2,%ymm4
{vex}   vpmadd52luq     384(%rcx),%ymm2,%ymm5
{vex}   vpmadd52luq     416(%rcx),%ymm2,%ymm6
{vex}   vpmadd52luq     448(%rcx),%ymm2,%ymm7
{vex}   vpmadd52luq     480(%rcx),%ymm2,%ymm8
{vex}   vpmadd52luq     512(%rcx),%ymm2,%ymm9
{vex}   vpmadd52luq     544(%rcx),%ymm2,%ymm10
{vex}   vpmadd52luq     576(%rcx),%ymm2,%ymm11
{vex}   vpmadd52luq     608(%rcx),%ymm2,%ymm12
        vmovdqu %ymm3,0(%rsp)
        vmovdqu %ymm4,32(%rsp)
        vmovdqu %ymm5,64(%rsp)
        vmovdqu %ymm6,96(%rsp)
        vmovdqu %ymm7,128(%rsp)
        vmovdqu %ymm8,160(%rsp)
        vmovdqu %ymm9,192(%rsp)
        vmovdqu %ymm10,224(%rsp)
        vmovdqu %ymm11,256(%rsp)
        vmovdqu %ymm12,288(%rsp)
        movq    $0,320(%rsp)

        vmovdqu 8(%rsp),%ymm3
        vmovdqu 40(%rsp),%ymm4
        vmovdqu 72(%rsp),%ymm5
        vmovdqu 104(%rsp),%ymm6
        vmovdqu 136(%rsp),%ymm7
        vmovdqu 168(%rsp),%ymm8
        vmovdqu 200(%rsp),%ymm9
        vmovdqu 232(%rsp),%ymm10
        vmovdqu 264(%rsp),%ymm11
        vmovdqu 296(%rsp),%ymm12

        addq    8(%rsp),%r9

{vex}   vpmadd52huq     320(%rsi),%ymm1,%ymm3
{vex}   vpmadd52huq     352(%rsi),%ymm1,%ymm4
{vex}   vpmadd52huq     384(%rsi),%ymm1,%ymm5
{vex}   vpmadd52huq     416(%rsi),%ymm1,%ymm6
{vex}   vpmadd52huq     448(%rsi),%ymm1,%ymm7
{vex}   vpmadd52huq     480(%rsi),%ymm1,%ymm8
{vex}   vpmadd52huq     512(%rsi),%ymm1,%ymm9
{vex}   vpmadd52huq     544(%rsi),%ymm1,%ymm10
{vex}   vpmadd52huq     576(%rsi),%ymm1,%ymm11
{vex}   vpmadd52huq     608(%rsi),%ymm1,%ymm12

{vex}   vpmadd52huq     320(%rcx),%ymm2,%ymm3
{vex}   vpmadd52huq     352(%rcx),%ymm2,%ymm4
{vex}   vpmadd52huq     384(%rcx),%ymm2,%ymm5
{vex}   vpmadd52huq     416(%rcx),%ymm2,%ymm6
{vex}   vpmadd52huq     448(%rcx),%ymm2,%ymm7
{vex}   vpmadd52huq     480(%rcx),%ymm2,%ymm8
{vex}   vpmadd52huq     512(%rcx),%ymm2,%ymm9
{vex}   vpmadd52huq     544(%rcx),%ymm2,%ymm10
{vex}   vpmadd52huq     576(%rcx),%ymm2,%ymm11
{vex}   vpmadd52huq     608(%rcx),%ymm2,%ymm12
        leaq    328(%rsp),%rsp
        leaq    8(%r11),%r11
        decl    %ebx
        jne     .Lloop40_1

        vmovq   %r9,%xmm0
        vpbroadcastq    %xmm0,%ymm0
        vpblendd        $3,%ymm0,%ymm3,%ymm3

        leaq    -640(%rsp),%rsp
        vmovupd %ymm3,0(%rsp)
        vmovupd %ymm4,32(%rsp)
        vmovupd %ymm5,64(%rsp)
        vmovupd %ymm6,96(%rsp)
        vmovupd %ymm7,128(%rsp)
        vmovupd %ymm8,160(%rsp)
        vmovupd %ymm9,192(%rsp)
        vmovupd %ymm10,224(%rsp)
        vmovupd %ymm11,256(%rsp)
        vmovupd %ymm12,288(%rsp)



        vpsrlq  $52,%ymm3,%ymm3
        vpsrlq  $52,%ymm4,%ymm4
        vpsrlq  $52,%ymm5,%ymm5
        vpsrlq  $52,%ymm6,%ymm6
        vpsrlq  $52,%ymm7,%ymm7
        vpsrlq  $52,%ymm8,%ymm8
        vpsrlq  $52,%ymm9,%ymm9
        vpsrlq  $52,%ymm10,%ymm10
        vpsrlq  $52,%ymm11,%ymm11
        vpsrlq  $52,%ymm12,%ymm12


        vpermq  $144,%ymm12,%ymm12
        vpermq  $3,%ymm11,%ymm13
        vblendpd        $1,%ymm13,%ymm12,%ymm12

        vpermq  $144,%ymm11,%ymm11
        vpermq  $3,%ymm10,%ymm13
        vblendpd        $1,%ymm13,%ymm11,%ymm11

        vpermq  $144,%ymm10,%ymm10
        vpermq  $3,%ymm9,%ymm13
        vblendpd        $1,%ymm13,%ymm10,%ymm10

        vpermq  $144,%ymm9,%ymm9
        vpermq  $3,%ymm8,%ymm13
        vblendpd        $1,%ymm13,%ymm9,%ymm9

        vpermq  $144,%ymm8,%ymm8
        vpermq  $3,%ymm7,%ymm13
        vblendpd        $1,%ymm13,%ymm8,%ymm8

        vpermq  $144,%ymm7,%ymm7
        vpermq  $3,%ymm6,%ymm13
        vblendpd        $1,%ymm13,%ymm7,%ymm7

        vpermq  $144,%ymm6,%ymm6
        vpermq  $3,%ymm5,%ymm13
        vblendpd        $1,%ymm13,%ymm6,%ymm6

        vpermq  $144,%ymm5,%ymm5
        vpermq  $3,%ymm4,%ymm13
        vblendpd        $1,%ymm13,%ymm5,%ymm5

        vpermq  $144,%ymm4,%ymm4
        vpermq  $3,%ymm3,%ymm13
        vblendpd        $1,%ymm13,%ymm4,%ymm4

        vpermq  $144,%ymm3,%ymm3
        vpand   .Lhigh64x3(%rip),%ymm3,%ymm3

        vmovupd %ymm3,320(%rsp)
        vmovupd %ymm4,352(%rsp)
        vmovupd %ymm5,384(%rsp)
        vmovupd %ymm6,416(%rsp)
        vmovupd %ymm7,448(%rsp)
        vmovupd %ymm8,480(%rsp)
        vmovupd %ymm9,512(%rsp)
        vmovupd %ymm10,544(%rsp)
        vmovupd %ymm11,576(%rsp)
        vmovupd %ymm12,608(%rsp)

        vmovupd 0(%rsp),%ymm3
        vmovupd 32(%rsp),%ymm4
        vmovupd 64(%rsp),%ymm5
        vmovupd 96(%rsp),%ymm6
        vmovupd 128(%rsp),%ymm7
        vmovupd 160(%rsp),%ymm8
        vmovupd 192(%rsp),%ymm9
        vmovupd 224(%rsp),%ymm10
        vmovupd 256(%rsp),%ymm11
        vmovupd 288(%rsp),%ymm12


        vpand   .Lmask52x4(%rip),%ymm3,%ymm3
        vpand   .Lmask52x4(%rip),%ymm4,%ymm4
        vpand   .Lmask52x4(%rip),%ymm5,%ymm5
        vpand   .Lmask52x4(%rip),%ymm6,%ymm6
        vpand   .Lmask52x4(%rip),%ymm7,%ymm7
        vpand   .Lmask52x4(%rip),%ymm8,%ymm8
        vpand   .Lmask52x4(%rip),%ymm9,%ymm9
        vpand   .Lmask52x4(%rip),%ymm10,%ymm10
        vpand   .Lmask52x4(%rip),%ymm11,%ymm11
        vpand   .Lmask52x4(%rip),%ymm12,%ymm12


        vpaddq  320(%rsp),%ymm3,%ymm3
        vpaddq  352(%rsp),%ymm4,%ymm4
        vpaddq  384(%rsp),%ymm5,%ymm5
        vpaddq  416(%rsp),%ymm6,%ymm6
        vpaddq  448(%rsp),%ymm7,%ymm7
        vpaddq  480(%rsp),%ymm8,%ymm8
        vpaddq  512(%rsp),%ymm9,%ymm9
        vpaddq  544(%rsp),%ymm10,%ymm10
        vpaddq  576(%rsp),%ymm11,%ymm11
        vpaddq  608(%rsp),%ymm12,%ymm12

        leaq    640(%rsp),%rsp



        vpcmpgtq        .Lmask52x4(%rip),%ymm3,%ymm13
        vmovmskpd       %ymm13,%r14d
        vpcmpgtq        .Lmask52x4(%rip),%ymm4,%ymm13
        vmovmskpd       %ymm13,%r13d
        shlb    $4,%r13b
        orb     %r13b,%r14b

        vpcmpgtq        .Lmask52x4(%rip),%ymm5,%ymm13
        vmovmskpd       %ymm13,%r13d
        vpcmpgtq        .Lmask52x4(%rip),%ymm6,%ymm13
        vmovmskpd       %ymm13,%r12d
        shlb    $4,%r12b
        orb     %r12b,%r13b

        vpcmpgtq        .Lmask52x4(%rip),%ymm7,%ymm13
        vmovmskpd       %ymm13,%r12d
        vpcmpgtq        .Lmask52x4(%rip),%ymm8,%ymm13
        vmovmskpd       %ymm13,%r11d
        shlb    $4,%r11b
        orb     %r11b,%r12b

        vpcmpgtq        .Lmask52x4(%rip),%ymm9,%ymm13
        vmovmskpd       %ymm13,%r11d
        vpcmpgtq        .Lmask52x4(%rip),%ymm10,%ymm13
        vmovmskpd       %ymm13,%r10d
        shlb    $4,%r10b
        orb     %r10b,%r11b

        vpcmpgtq        .Lmask52x4(%rip),%ymm11,%ymm13
        vmovmskpd       %ymm13,%r10d
        vpcmpgtq        .Lmask52x4(%rip),%ymm12,%ymm13
        vmovmskpd       %ymm13,%r9d
        shlb    $4,%r9b
        orb     %r9b,%r10b

        addb    %r14b,%r14b
        adcb    %r13b,%r13b
        adcb    %r12b,%r12b
        adcb    %r11b,%r11b
        adcb    %r10b,%r10b


        vpcmpeqq        .Lmask52x4(%rip),%ymm3,%ymm13
        vmovmskpd       %ymm13,%r9d
        vpcmpeqq        .Lmask52x4(%rip),%ymm4,%ymm13
        vmovmskpd       %ymm13,%r8d
        shlb    $4,%r8b
        orb     %r8b,%r9b

        vpcmpeqq        .Lmask52x4(%rip),%ymm5,%ymm13
        vmovmskpd       %ymm13,%r8d
        vpcmpeqq        .Lmask52x4(%rip),%ymm6,%ymm13
        vmovmskpd       %ymm13,%edx
        shlb    $4,%dl
        orb     %dl,%r8b

        vpcmpeqq        .Lmask52x4(%rip),%ymm7,%ymm13
        vmovmskpd       %ymm13,%edx
        vpcmpeqq        .Lmask52x4(%rip),%ymm8,%ymm13
        vmovmskpd       %ymm13,%ecx
        shlb    $4,%cl
        orb     %cl,%dl

        vpcmpeqq        .Lmask52x4(%rip),%ymm9,%ymm13
        vmovmskpd       %ymm13,%ecx
        vpcmpeqq        .Lmask52x4(%rip),%ymm10,%ymm13
        vmovmskpd       %ymm13,%ebx
        shlb    $4,%bl
        orb     %bl,%cl

        vpcmpeqq        .Lmask52x4(%rip),%ymm11,%ymm13
        vmovmskpd       %ymm13,%ebx
        vpcmpeqq        .Lmask52x4(%rip),%ymm12,%ymm13
        vmovmskpd       %ymm13,%eax
        shlb    $4,%al
        orb     %al,%bl

        addb    %r9b,%r14b
        adcb    %r8b,%r13b
        adcb    %dl,%r12b
        adcb    %cl,%r11b
        adcb    %bl,%r10b

        xorb    %r9b,%r14b
        xorb    %r8b,%r13b
        xorb    %dl,%r12b
        xorb    %cl,%r11b
        xorb    %bl,%r10b

        pushq   %r9
        pushq   %r8

        leaq    .Lkmasklut(%rip),%r8

        movb    %r14b,%r9b
        andq    $0xf,%r14
        vpsubq  .Lmask52x4(%rip),%ymm3,%ymm13
        shlq    $5,%r14
        vmovapd (%r8,%r14,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm3,%ymm3

        shrb    $4,%r9b
        andq    $0xf,%r9
        vpsubq  .Lmask52x4(%rip),%ymm4,%ymm13
        shlq    $5,%r9
        vmovapd (%r8,%r9,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm4,%ymm4

        movb    %r13b,%r9b
        andq    $0xf,%r13
        vpsubq  .Lmask52x4(%rip),%ymm5,%ymm13
        shlq    $5,%r13
        vmovapd (%r8,%r13,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm5,%ymm5

        shrb    $4,%r9b
        andq    $0xf,%r9
        vpsubq  .Lmask52x4(%rip),%ymm6,%ymm13
        shlq    $5,%r9
        vmovapd (%r8,%r9,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm6,%ymm6

        movb    %r12b,%r9b
        andq    $0xf,%r12
        vpsubq  .Lmask52x4(%rip),%ymm7,%ymm13
        shlq    $5,%r12
        vmovapd (%r8,%r12,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm7,%ymm7

        shrb    $4,%r9b
        andq    $0xf,%r9
        vpsubq  .Lmask52x4(%rip),%ymm8,%ymm13
        shlq    $5,%r9
        vmovapd (%r8,%r9,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm8,%ymm8

        movb    %r11b,%r9b
        andq    $0xf,%r11
        vpsubq  .Lmask52x4(%rip),%ymm9,%ymm13
        shlq    $5,%r11
        vmovapd (%r8,%r11,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm9,%ymm9

        shrb    $4,%r9b
        andq    $0xf,%r9
        vpsubq  .Lmask52x4(%rip),%ymm10,%ymm13
        shlq    $5,%r9
        vmovapd (%r8,%r9,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm10,%ymm10

        movb    %r10b,%r9b
        andq    $0xf,%r10
        vpsubq  .Lmask52x4(%rip),%ymm11,%ymm13
        shlq    $5,%r10
        vmovapd (%r8,%r10,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm11,%ymm11

        shrb    $4,%r9b
        andq    $0xf,%r9
        vpsubq  .Lmask52x4(%rip),%ymm12,%ymm13
        shlq    $5,%r9
        vmovapd (%r8,%r9,1),%ymm14
        vblendvpd       %ymm14,%ymm13,%ymm12,%ymm12

        popq    %r8
        popq    %r9

        vpand   .Lmask52x4(%rip),%ymm3,%ymm3
        vpand   .Lmask52x4(%rip),%ymm4,%ymm4
        vpand   .Lmask52x4(%rip),%ymm5,%ymm5
        vpand   .Lmask52x4(%rip),%ymm6,%ymm6
        vpand   .Lmask52x4(%rip),%ymm7,%ymm7
        vpand   .Lmask52x4(%rip),%ymm8,%ymm8
        vpand   .Lmask52x4(%rip),%ymm9,%ymm9

        vpand   .Lmask52x4(%rip),%ymm10,%ymm10
        vpand   .Lmask52x4(%rip),%ymm11,%ymm11
        vpand   .Lmask52x4(%rip),%ymm12,%ymm12

        vmovdqu %ymm3,320(%rdi)
        vmovdqu %ymm4,352(%rdi)
        vmovdqu %ymm5,384(%rdi)
        vmovdqu %ymm6,416(%rdi)
        vmovdqu %ymm7,448(%rdi)
        vmovdqu %ymm8,480(%rdi)
        vmovdqu %ymm9,512(%rdi)
        vmovdqu %ymm10,544(%rdi)
        vmovdqu %ymm11,576(%rdi)
        vmovdqu %ymm12,608(%rdi)

        vzeroupper
        leaq    (%rsp),%rax
.cfi_def_cfa_register   %rax
        movq    0(%rax),%r15
.cfi_restore    %r15
        movq    8(%rax),%r14
.cfi_restore    %r14
        movq    16(%rax),%r13
.cfi_restore    %r13
        movq    24(%rax),%r12
.cfi_restore    %r12
        movq    32(%rax),%rbp
.cfi_restore    %rbp
        movq    40(%rax),%rbx
.cfi_restore    %rbx
        leaq    48(%rax),%rsp
.cfi_def_cfa    %rsp,8
.Lossl_rsaz_amm52x40_x2_avxifma256_epilogue:
        .byte   0xf3,0xc3
.cfi_endproc
.size   ossl_rsaz_amm52x40_x2_avxifma256, .-ossl_rsaz_amm52x40_x2_avxifma256
.text

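# ossl_extract_multiplier_2x40_win5_avx: constant-time gather of two
# 40-digit multipliers from a window-5 table of 32 entries, 640 bytes
# each (rax = table end = rsi + 32*640).  Every entry is read and the
# wanted ones are selected with vblendvpd, so the memory access pattern
# is independent of the secret indices.
# Presumed arguments: rdi = out, rsi = table, rdx/rcx = the two indices.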
.globl  ossl_extract_multiplier_2x40_win5_avx
.type   ossl_extract_multiplier_2x40_win5_avx,@function
.align  32
ossl_extract_multiplier_2x40_win5_avx:
.cfi_startproc
.byte   243,15,30,250
        vmovapd .Lones(%rip),%ymm14
        vmovq   %rdx,%xmm10
        vpbroadcastq    %xmm10,%ymm12
        vmovq   %rcx,%xmm10
        vpbroadcastq    %xmm10,%ymm13
        leaq    20480(%rsi),%rax


        movq    %rsi,%r10


        vpxor   %xmm0,%xmm0,%xmm0
        vmovapd %ymm0,%ymm1
        vmovapd %ymm0,%ymm2
        vmovapd %ymm0,%ymm3
        vmovapd %ymm0,%ymm4
        vmovapd %ymm0,%ymm5
        vmovapd %ymm0,%ymm6
        vmovapd %ymm0,%ymm7
        vmovapd %ymm0,%ymm8
        vmovapd %ymm0,%ymm9
        vpxor   %ymm11,%ymm11,%ymm11
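        # First index: ymm11 counts entries; the vpcmpeqq mask is all-ones
        # only on the matching entry, so each vblendvpd either keeps or
        # replaces the accumulated row.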
.align  32
.Lloop_0:
        vpcmpeqq        %ymm11,%ymm12,%ymm15
        vmovdqu 0(%rsi),%ymm10

        vblendvpd       %ymm15,%ymm10,%ymm0,%ymm0
        vmovdqu 32(%rsi),%ymm10

        vblendvpd       %ymm15,%ymm10,%ymm1,%ymm1
        vmovdqu 64(%rsi),%ymm10

        vblendvpd       %ymm15,%ymm10,%ymm2,%ymm2
        vmovdqu 96(%rsi),%ymm10

        vblendvpd       %ymm15,%ymm10,%ymm3,%ymm3
        vmovdqu 128(%rsi),%ymm10

        vblendvpd       %ymm15,%ymm10,%ymm4,%ymm4
        vmovdqu 160(%rsi),%ymm10

        vblendvpd       %ymm15,%ymm10,%ymm5,%ymm5
        vmovdqu 192(%rsi),%ymm10

        vblendvpd       %ymm15,%ymm10,%ymm6,%ymm6
        vmovdqu 224(%rsi),%ymm10

        vblendvpd       %ymm15,%ymm10,%ymm7,%ymm7
        vmovdqu 256(%rsi),%ymm10

        vblendvpd       %ymm15,%ymm10,%ymm8,%ymm8
        vmovdqu 288(%rsi),%ymm10

        vblendvpd       %ymm15,%ymm10,%ymm9,%ymm9
        vpaddq  %ymm14,%ymm11,%ymm11
        addq    $640,%rsi
        cmpq    %rsi,%rax
        jne     .Lloop_0
        vmovdqu %ymm0,0(%rdi)
        vmovdqu %ymm1,32(%rdi)
        vmovdqu %ymm2,64(%rdi)
        vmovdqu %ymm3,96(%rdi)
        vmovdqu %ymm4,128(%rdi)
        vmovdqu %ymm5,160(%rdi)
        vmovdqu %ymm6,192(%rdi)
        vmovdqu %ymm7,224(%rdi)
        vmovdqu %ymm8,256(%rdi)
        vmovdqu %ymm9,288(%rdi)
        movq    %r10,%rsi
        vpxor   %ymm11,%ymm11,%ymm11
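        # Second index, at offset 320 within each entry.  No need to re-zero
        # ymm0..ymm9: the matching iteration's blend overwrites them entirely.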
.align  32
.Lloop_320:
        vpcmpeqq        %ymm11,%ymm13,%ymm15
        vmovdqu 320(%rsi),%ymm10

        vblendvpd       %ymm15,%ymm10,%ymm0,%ymm0
        vmovdqu 352(%rsi),%ymm10

        vblendvpd       %ymm15,%ymm10,%ymm1,%ymm1
        vmovdqu 384(%rsi),%ymm10

        vblendvpd       %ymm15,%ymm10,%ymm2,%ymm2
        vmovdqu 416(%rsi),%ymm10

        vblendvpd       %ymm15,%ymm10,%ymm3,%ymm3
        vmovdqu 448(%rsi),%ymm10

        vblendvpd       %ymm15,%ymm10,%ymm4,%ymm4
        vmovdqu 480(%rsi),%ymm10

        vblendvpd       %ymm15,%ymm10,%ymm5,%ymm5
        vmovdqu 512(%rsi),%ymm10

        vblendvpd       %ymm15,%ymm10,%ymm6,%ymm6
        vmovdqu 544(%rsi),%ymm10

        vblendvpd       %ymm15,%ymm10,%ymm7,%ymm7
        vmovdqu 576(%rsi),%ymm10

        vblendvpd       %ymm15,%ymm10,%ymm8,%ymm8
        vmovdqu 608(%rsi),%ymm10

        vblendvpd       %ymm15,%ymm10,%ymm9,%ymm9
        vpaddq  %ymm14,%ymm11,%ymm11
        addq    $640,%rsi
        cmpq    %rsi,%rax
        jne     .Lloop_320
        vmovdqu %ymm0,320(%rdi)
        vmovdqu %ymm1,352(%rdi)
        vmovdqu %ymm2,384(%rdi)
        vmovdqu %ymm3,416(%rdi)
        vmovdqu %ymm4,448(%rdi)
        vmovdqu %ymm5,480(%rdi)
        vmovdqu %ymm6,512(%rdi)
        vmovdqu %ymm7,544(%rdi)
        vmovdqu %ymm8,576(%rdi)
        vmovdqu %ymm9,608(%rdi)

        .byte   0xf3,0xc3
.cfi_endproc
.size   ossl_extract_multiplier_2x40_win5_avx, .-ossl_extract_multiplier_2x40_win5_avx
.section        .rodata
.align  32
.Lones:
.quad   1,1,1,1
.Lzeros:
.quad   0,0,0,0
        .section ".note.gnu.property", "a"
        .p2align 3
        .long 1f - 0f
        .long 4f - 1f
        .long 5
0:
        # "GNU" encoded with .byte, since .asciz isn't supported
        # on Solaris.
        .byte 0x47
        .byte 0x4e
        .byte 0x55
        .byte 0
1:
        .p2align 3
        .long 0xc0000002
        .long 3f - 2f
2:
        .long 3
3:
        .p2align 3
4:
